library(tidyverse)
library(ggplot2)
library(dplyr)
library(broom)
library(countrycode)
library(tidytext)
library(viridis)
library(rworldmap)
library(ggmap)
library(maps)
library(sp)
library(maptools)
library(readr)
library(ggraph)
library(igraph)
library(RColorBrewer)
library(gganimate)
library(gifski)
library(ggalluvial)
library(hrbrthemes)
library(kableExtra)
library(leaflet)
library(scales)
library(grid)
library(gridExtra)
data <- read_csv("data/athlete_events.csv")
dat1 <- read_csv("data/countries.csv")
Medal distribution by country
medal <- data %>%
filter(!is.na(Medal))
team<- medal %>%
group_by(NOC) %>%
summarise(Gold = sum (Medal == "Gold"),
Silver = sum (Medal == "Silver"),
Bronze = sum (Medal == "Bronze"))
team_M <- team %>%
pivot_longer(!"NOC", names_to = "medals", values_to = "number_of_medals")
country_M <- countrycode(team_M$NOC, "ioc", "country.name")
country_M <- data_frame(country_M, team_M)
country_M<- na.omit(country_M)
team_T <- cbind(team, c(rowSums(team [, 2:4], na.rm = TRUE)))
colnames(team_T)[5] <- "Total"
country_T <- countrycode(team_T$NOC, "ioc", "country.name")
country_T <- data_frame(country_T, team_T)
country_T<- na.omit(country_T)
country_T <- country_T %>%
arrange(desc(Total)) %>%
rename("Country" = country_T)
country_T_10 <- country_T%>%
top_n(10)
knitr::kable(country_T_10, caption = "The medals in different country", col.names = c("Country", "NOC", "Gold", "Silver", "Bronze", "Total"))
| Country | NOC | Gold | Silver | Bronze | Total |
|---|---|---|---|---|---|
| United States | USA | 2638 | 1641 | 1358 | 5637 |
| Germany | GER | 745 | 674 | 746 | 2165 |
| United Kingdom | GBR | 678 | 739 | 651 | 2068 |
| France | FRA | 501 | 610 | 666 | 1777 |
| Italy | ITA | 575 | 531 | 531 | 1637 |
| Sweden | SWE | 479 | 522 | 535 | 1536 |
| Canada | CAN | 463 | 438 | 451 | 1352 |
| Australia | AUS | 348 | 455 | 517 | 1320 |
| Russia | RUS | 390 | 367 | 408 | 1165 |
| Hungary | HUN | 432 | 332 | 371 | 1135 |
ct <- inner_join(country_T, dat1, by = "Country")
world <- map_data("world")
visit.x<-ct$Longitude
visit.y<-ct$Latitude
hex_codes <- hue_pal(h=c(180,270)) (length(ct$Country))
pal <- colorFactor(hex_codes, domain = ct$Country)
mytext <- paste(
"Country: ", ct$Country,
"Total: ", ct$Total,
"Gold: ", ct$Gold,
"Silver: ", ct$Silver,
"Bronze: ", ct$Bronze) %>%
lapply(htmltools::HTML)
map <- leaflet(ct) %>%
addTiles() %>%
addCircles(
lng = ~Longitude,
lat = ~Latitude,
radius = ~Total*300,
stroke = F,
fillOpacity = 0.4,
color = ~pal(Country),
label = mytext
) %>%
addLegend ("topright",
pal=pal,
values = ~Country,
title = "Country",
opacity = 1
)
map
The map shows the number of medals won by each country in the world at the Olympic Games, including gold, silver, bronze and total. The size of the circles on the map indicates the number of medals won, so it is easy to see that the USA has the most medals. Europe has the highest number of medals, and the density of the circles shows that most European countries have won medals and have accumulated a significant number of medals in total.
Find out which sport has the largest number of participants and study the distribution of gold MEDALS in different countries over time
sport_count<- data %>%
mutate(Number_of_people_in_each_sport = Sport) %>%
count(Number_of_people_in_each_sport) %>%
top_n(10)
sport_count %>%
mutate(Number_of_people_in_each_sport = fct_reorder(Number_of_people_in_each_sport, n)) %>%
ggplot(aes(x = Number_of_people_in_each_sport,
y = n,
fill = Number_of_people_in_each_sport)) +
coord_flip() +
geom_text(aes(x = Number_of_people_in_each_sport,
y = n + 4000,
label = n)) +
geom_col() +
xlab("Sport") +
ylab("Number of people in each sport") +
theme(strip.text = element_text(size = 10),
axis.text = element_text(size = 10),
axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 0.15),
axis.title.x = element_text(size = 15),
axis.title.y = element_text(size = 15))
Figure 0.1: Top 10 sports with the most athletes
With the Figure 0.1, It illustrate the number of participants in different sports, and select the top 10 of them, according to the figure, the most popular sports is Athletics which is 38,624. the second is gymnastics and the third one is swimming. the fine thing about this figure is that the number of participate athletes of first 3 sport is far more than any other sports.
Athletics_1 <- data %>%
filter(Sport == "Athletics" & Medal == "Gold")%>%
select(Sport, NOC, Medal, Year) %>%
cbind(1) %>%
rename("Number" = "1") %>%
group_by(NOC, Year) %>%
summarise(Total = sum(Number)) %>%
mutate(Accumu = cumsum(Total))
Athletics <- inner_join(Athletics_1, country_T, by="NOC")
ranking <- Athletics %>%
group_by(Year) %>%
mutate(rank = rank(-Accumu),
Accumu_rel = Accumu/Accumu[rank==1],
Accumu_lbl = paste0(" ",round(Accumu/1e9))) %>%
group_by(Country) %>%
filter(rank <=5) %>%
ungroup()
ggp <- ggplot(ranking,
aes(x = rank,
y = Accumu,
group = Country)) +
geom_bar(stat = "identity",
aes(fill = Country)) +
transition_states(Year, transition_length = 2, state_length = 0) +
ease_aes('quadratic-in-out') +
enter_drift(x_mod = -1) + exit_drift(x_mod = 1) +
labs(x = "Ranking of the gold medal accumulated",
y = "Accumulated Number of gold medal",
title = "Year {closest_state}")
animate(ggp, 200, fps = 20, width = 1200, height = 1000,
renderer = gifski_renderer("gganim.gif"))
Figure 0.2: In the Athletics sports, the ranking of first 5 contries about Total number of Cumulative gold MEDALS won
Then we focus on the athletics sports, the above Figure 0.2 describes only in the Athletics sports, it shows number of Cumulative gold MEDALS won in each country and ranking them. The x-axis is about first 5 ranking, the y-axis is about the Accumulated Number of gold medal, and the different color means different countries, so there are 2 interesting finding in this GIF,
The American is always number one in the ranking list except year 1980. That’s because the USA did not join the sport meet in that year to boycott the former Soviet Union.
Sometimes the bar overlapped on the x-axis, which means it shares the same ranking place in this year.
How about the Medals of top 5 countries allocated in the events (Ziang_Li)
In the section 1, we discussed the number of medals each country has won in the previous Olympic Games. In section2, we analyzed the sports with the largest number of athletes participate in. Hence, in section3, we will compare the five countries with the most medals and the distribution of medals in the five events with the largest number of participants.
From the figure, it clearly shows that the Medals changing of the Top5 countries in previous Olympic Games. The United States has consistently ranked first in the total number of medals. After 1990, the number of medals in Germany has increased rapidly. And in 1998, Germany ranked second in the total number of medals.
From the figure, it shows the distribution of the top5 countries medals among the 5 sports with the most participants. The number of medals in the United States is larger than the other four countries and The United States has an absolute advantage in swimming events, especially in the relay race events. The medals of United Kingdom are more distributed in Cycling. In addition, the medals from other countries are very evenly distributed among the five events
top1 <- country_T %>%
top_n(5)
aa <- sport_count %>%
arrange(desc(n)) %>%
top_n(5)
ttt <- top1 %>%
inner_join(data, top1, by="NOC") %>%
filter (!is.na (Medal)) %>%
filter ( Sport == "Athletics" | Sport == "Gymnastics" | Sport == "Swimming" | Sport == "Shooting" | Sport == "Cycling") %>%
select (Country, Sport, Event, Medal) %>%
group_by(Country, Sport, Event, Medal) %>%
tally() %>%
ungroup() %>%
group_by(Country, Sport, Event) %>%
summarise( n = sum(n))
Athletics <- ttt %>%
ungroup() %>%
filter(Sport == "Athletics") %>%
group_by(Event) %>%
summarise( tot = sum(n)) %>%
arrange(desc(tot)) %>%
top_n(5)
Cycling <- ttt %>%
ungroup() %>%
filter(Sport == "Cycling") %>%
group_by(Event) %>%
summarise( tot = sum(n)) %>%
arrange(desc(tot)) %>%
top_n(5)
Gymnastics <- ttt %>%
ungroup() %>%
filter(Sport == "Gymnastics") %>%
group_by(Event) %>%
summarise( tot = sum(n)) %>%
arrange(desc(tot)) %>%
top_n(5)
Shooting <- ttt %>%
ungroup() %>%
filter(Sport == "Shooting") %>%
group_by(Event) %>%
summarise( tot = sum(n)) %>%
arrange(desc(tot)) %>%
top_n(5)
Swimming <- ttt %>%
ungroup() %>%
filter(Sport == "Swimming") %>%
group_by(Event) %>%
summarise( tot = sum(n)) %>%
arrange(desc(tot)) %>%
top_n(5)
rrr <- rbind(Athletics, Cycling, Gymnastics, Shooting, Swimming)
ooo <- left_join(rrr, ttt, by="Event")
qqq <- data %>%
filter(!is.na(Medal)) %>%
select (NOC, Year, Medal)%>%
cbind (1) %>%
rename( "Number" = "1") %>%
select(NOC, Year, Number)
qqq <- left_join(qqq, country_T)
www <- qqq %>%
select(Country, Year, Number) %>%
group_by(Country, Year) %>%
summarise( number = sum(Number)) %>%
mutate(total = cumsum(number)) %>%
filter(Country == "United States" | Country == "Germany" | Country == "United Kingdom" | Country == "France" | Country == "Italy")
q <- ggplot(www, aes(x=Year, y=total, group=Country, color=Country)) +
geom_line() +
geom_point() +
scale_color_viridis(discrete = TRUE) +
ggtitle("Changes in the total number of medals of the top 5 countries in the previous Olympic Games") +
theme_ipsum() +
ylab("Number of the medals") +
transition_reveal(Year)
animate(q, nframes = 350,fps = 25, width = 1200, height = 1000,
renderer = gifski_renderer("yearly.gif"))
ggplot(as.data.frame(ooo),
aes(y = n, axis1 = Country, axis2 = Sport, axis3 = Event)) +
geom_alluvium(aes(fill = Country), width = 1/40) +
geom_stratum(width = 1/50, fill = "white", color = "black") +
geom_label(stat = "stratum", aes(label = after_stat(stratum)))+
scale_x_discrete(limits = c("Country", "Sport", "Event"), expand = c(.05, .05)) +
scale_fill_brewer(type = "qual", palette = "Set3") +
ggtitle("Distribution of the top 5 Countries' medals among the 5 sports with the most participants")
data<- read.csv("Data/athlete_events.csv")
clean_data <- data %>%
select (c("Sex","Age","Medal")) %>%
mutate(Sex=as.factor(Sex),
Medal= as.factor(Medal)) %>%
group_by(Sex,Age) %>%
count(Medal)
clean_data
## # A tibble: 425 x 4
## # Groups: Sex, Age [138]
## Sex Age Medal n
## <fct> <int> <fct> <int>
## 1 F 11 Silver 1
## 2 F 11 <NA> 11
## 3 F 12 Bronze 1
## 4 F 12 Silver 3
## 5 F 12 <NA> 28
## 6 F 13 Bronze 2
## 7 F 13 Gold 5
## 8 F 13 Silver 6
## 9 F 13 <NA> 138
## 10 F 14 Bronze 15
## # … with 415 more rows
clean_data1 <- clean_data %>%
mutate(age_group = cut(Age,breaks = c(0,15,30,45,60,75,90))) %>%
select(-Age) %>%
group_by(Sex,age_group) %>%
count(Medal) %>%
rename(number = "n")
## Adding missing grouping variables: `Age`
clean_data1
## # A tibble: 48 x 4
## # Groups: Sex, age_group [13]
## Sex age_group Medal number
## <fct> <fct> <fct> <int>
## 1 F (0,15] Bronze 4
## 2 F (0,15] Gold 3
## 3 F (0,15] Silver 5
## 4 F (0,15] <NA> 5
## 5 F (15,30] Bronze 15
## 6 F (15,30] Gold 15
## 7 F (15,30] Silver 15
## 8 F (15,30] <NA> 15
## 9 F (30,45] Bronze 15
## 10 F (30,45] Gold 14
## # … with 38 more rows
table1<- clean_data1 %>%
group_by (Sex,age_group) %>%
summarise(total_medal_in_age_group= sum(number))
table1
## # A tibble: 13 x 3
## # Groups: Sex [2]
## Sex age_group total_medal_in_age_group
## <fct> <fct> <int>
## 1 F (0,15] 17
## 2 F (15,30] 60
## 3 F (30,45] 59
## 4 F (45,60] 32
## 5 F (60,75] 15
## 6 F <NA> 4
## 7 M (0,15] 15
## 8 M (15,30] 60
## 9 M (30,45] 60
## 10 M (45,60] 60
## 11 M (60,75] 31
## 12 M (75,90] 6
## 13 M <NA> 6
| Sex | total_medal_by_sex | age_group | total_medal_in_age_group | Percentage |
|---|---|---|---|---|
| F | 187 | (0,15] | 17 | 9.090909 |
| F | 187 | (15,30] | 60 | 32.085561 |
| F | 187 | (30,45] | 59 | 31.550802 |
| F | 187 | (45,60] | 32 | 17.112299 |
| F | 187 | (60,75] | 15 | 8.021390 |
| F | 187 | NA | 4 | 2.139037 |
| M | 238 | (0,15] | 15 | 6.302521 |
| M | 238 | (15,30] | 60 | 25.210084 |
| M | 238 | (30,45] | 60 | 25.210084 |
| M | 238 | (45,60] | 60 | 25.210084 |
| M | 238 | (60,75] | 31 | 13.025210 |
| M | 238 | (75,90] | 6 | 2.521008 |
| M | 238 | NA | 6 | 2.521008 |
In Table 0.2 we compare the total medal won by different age groups for both female and male.
From this table, it shows that female younger athletes age range from 15- 30 has the highest percentage (32.09%) of total medal won compared to other age group, while older female athletes age from 60 to 75 has the least percentage of medal won (8.02%). However, for males athletes, it appears that age group between (15-30),(30-45),(45-60_ share the same percentage( and highest) of total medal won, while older male athletes age form 75 to 90 again has the least percentage of medal won ( 2.52%).
Figure 0.3: The percentage of total medal won in different age group by sex
In Figure 0.3, we have plotted the percentage of the total medal won by different age group and compared in against both gender of male and female.
0.3, depicts that for the age groups aged (0-15),(15-30),and (30-45), female athletes is accounted of a higher proportion of the total medal compared to males, However by the age group (45 to 60), males athletes exceed female athletes in the proportion of total medal won and continues to have a higher proportion than female in the older age bracket.
##Second question: Comparison of different medal distribution between age group by gender.
clean_data2 <- data %>%
select (c("Sex","Age","Medal")) %>%
mutate(Sex=as.factor(Sex),
Medal= as.factor(Medal)) %>%
group_by(Sex,Age) %>%
count(Medal) %>%
rename("number"=n)
clean_data2
## # A tibble: 425 x 4
## # Groups: Sex, Age [138]
## Sex Age Medal number
## <fct> <int> <fct> <int>
## 1 F 11 Silver 1
## 2 F 11 <NA> 11
## 3 F 12 Bronze 1
## 4 F 12 Silver 3
## 5 F 12 <NA> 28
## 6 F 13 Bronze 2
## 7 F 13 Gold 5
## 8 F 13 Silver 6
## 9 F 13 <NA> 138
## 10 F 14 Bronze 15
## # … with 415 more rows
gold1<- clean_data2 %>%
filter(Medal== "Gold") %>%
group_by(Sex) %>%
summarise(total_silver_medal=sum(number))
gold<- clean_data2 %>%
filter(Medal== "Gold") %>%
left_join(gold1 ,by = "Sex") %>%
mutate("Percentage" = ((number/total_silver_medal)*100))
silver1<- clean_data2 %>%
filter(Medal== "Silver") %>%
group_by(Sex) %>%
summarise(total_silver_medal=sum(number))
silver<- clean_data2 %>%
filter(Medal== "Silver") %>%
left_join(silver1 ,by = "Sex") %>%
mutate("Percentage" = ((number/total_silver_medal)*100))
bronze1<- clean_data2 %>%
filter(Medal== "Bronze") %>%
group_by(Sex) %>%
summarise(total_bronze_medal=sum(number))
bronze<- clean_data2 %>%
filter(Medal== "Bronze") %>%
left_join(bronze1 ,by = "Sex") %>%
mutate("Percentage" = ((number/total_bronze_medal)*100))
gold
## # A tibble: 89 x 6
## # Groups: Sex, Age [89]
## Sex Age Medal number total_silver_medal Percentage
## <fct> <int> <fct> <int> <int> <dbl>
## 1 F 13 Gold 5 3747 0.133
## 2 F 14 Gold 20 3747 0.534
## 3 F 15 Gold 66 3747 1.76
## 4 F 16 Gold 103 3747 2.75
## 5 F 17 Gold 133 3747 3.55
## 6 F 18 Gold 160 3747 4.27
## 7 F 19 Gold 177 3747 4.72
## 8 F 20 Gold 188 3747 5.02
## 9 F 21 Gold 265 3747 7.07
## 10 F 22 Gold 310 3747 8.27
## # … with 79 more rows
gold_plot<- ggplot(data=gold, aes( x= Age,
y= Percentage,
fill= Sex)) +
geom_density (alpha = 0.4, stat = "identity", position = "identity")+
ggtitle("The distribution of gold medal won by different age group for female and male")
silver
## # A tibble: 100 x 6
## # Groups: Sex, Age [100]
## Sex Age Medal number total_silver_medal Percentage
## <fct> <int> <fct> <int> <int> <dbl>
## 1 F 11 Silver 1 3735 0.0268
## 2 F 12 Silver 3 3735 0.0803
## 3 F 13 Silver 6 3735 0.161
## 4 F 14 Silver 25 3735 0.669
## 5 F 15 Silver 59 3735 1.58
## 6 F 16 Silver 101 3735 2.70
## 7 F 17 Silver 100 3735 2.68
## 8 F 18 Silver 158 3735 4.23
## 9 F 19 Silver 179 3735 4.79
## 10 F 20 Silver 205 3735 5.49
## # … with 90 more rows
silver_plot<- ggplot(data=silver, aes( Age,
y= Percentage,
fill= Sex)) +
geom_density (alpha = 0.4, stat = "identity", position = "identity")+
ggtitle("The distribution of silver medal won by different age group for female and male ")
bronze
## # A tibble: 99 x 6
## # Groups: Sex, Age [99]
## Sex Age Medal number total_bronze_medal Percentage
## <fct> <int> <fct> <int> <int> <dbl>
## 1 F 12 Bronze 1 3771 0.0265
## 2 F 13 Bronze 2 3771 0.0530
## 3 F 14 Bronze 15 3771 0.398
## 4 F 15 Bronze 51 3771 1.35
## 5 F 16 Bronze 86 3771 2.28
## 6 F 17 Bronze 110 3771 2.92
## 7 F 18 Bronze 139 3771 3.69
## 8 F 19 Bronze 167 3771 4.43
## 9 F 20 Bronze 216 3771 5.73
## 10 F 21 Bronze 265 3771 7.03
## # … with 89 more rows
bronze_plot<- ggplot(data=bronze, aes(x=Age,
y= Percentage,
fill= Sex)) +
geom_density (alpha = 0.4, stat = "identity", position = "identity")+
ggtitle("The distribution of bronze medal won by different age group for female and male ")
grid.arrange(gold_plot,silver_plot,bronze_plot, ncol=2)
Figure 0.4: The medal distribution won in different age group by sex
n Figure 0.4, the different medal distribution was plotted for different age and compared in against both gender of male and female.
From this plot. the gold medal distribution for female and male are positively skewed with the athletes in the younger age group accounting for more of the gold medals earned than the older athletes (specifically, both female and male athletes age in early 20s have the highest percentage of the gold medal won).Similar result could be seen for silver medal distribution, both distribution for female and male athlete are positively skewed, however we can see that the age group that account the most percentage of silver medal won is ranged from 20-30 years older for both female and male athletes. In the bronze medal distribution, it shares similar distribution as silver medal as athletes age from 20 to 30 is account for the most percentage of bronze medal won for both female and male. Additionally, it is seen that for all medal distribution, female athletes of younger age (0-20) tend to account for higher percentage of medal (gold, silver, and bronze) than male athletes, however by age 30 and over, male athletes exceed female athletes in medal won for gold, silver and bronze medal. This could be due the quicker fall of physical, technical and strategics abilities of females athletes 30 and over , companied with increasing social pressure that female of an older age to be more family orientated.
Hence from the above analysis, we conclude that in general for both gender, age 20 to 30 tends to account for most proportions of the medals (gold, silver, and bronze)won, furthermore, it was found that female athletes tends to do better than male for medal won before age20, however later was exceed by male after age 30.
In summary, from the presentation today, we conclude that for quesiton1 when comparing the distribution of medal in different country aound the world, America appears to have the most medals in the world. In addtional, for question two, we have found that the sport athletics have the largest number of athletics, and in this sports, The United States accumulated the most gold MEDALS in each year. Furthermore for question three, we conclude that The United States has an absolute advantage in swimming events,The medals of United Kingdom are more distributed in Cycling. and that the Medals from other countries are very evenly distributed among the five events lastly. when comparing the medals won for different age group by gender, it was revealed that in general for both gender, age bracket from 20-40 and 40 to 60 in general has the most proporiton of medal won compared to other age group for gold, silver and bronze medal won.